# The package "ezids" (EZ Intro to Data Science) includes a lot of the helper functions we developed for the course.
# Importing the necessary libraries
library(ezids)
library(dplyr)
library(ggplot2)
library(DT)
library(corrplot)
library(lubridate)
library(tidyr)
library(scales)
library(cluster)
library(knitr)
library(kableExtra)
library(caret)
library(xgboost)
library(pROC)
library(e1071)
library(rpart)
library(rpart.plot)
library(randomForest)
Our research focuses on exploring the features of Google Play Store apps that drive app success.
“Which are the top 5 app categories, as identified by classification models (logistic regression, SVM, XGBoost, KNN, and random forest), that significantly influenced app success (measured by installs) based on app data from 2010 to 2018, and how accurately can these models predict success trends within this time period?”
Here, we load the dataset ‘Google Play Store Apps’, stored in a CSV file, using `read.csv()`.
# Loading the dataset ----
# read.csv() already returns a data.frame, so the extra data.frame() wrapper
# in the original was redundant and has been removed.
data_apps <- read.csv("googleplaystore.csv")

# Drop columns that are not used in this analysis.
drop_cols <- c("Android.Ver", "Current.Ver", "Type", "App", "Genres")
data_apps <- data_apps[, !(colnames(data_apps) %in% drop_cols)]
## Price ----
### Conversion of Price to numeric
# Strip the dollar sign and coerce to numeric; entries that fail to parse
# become NA (with a coercion warning).
data_apps$Price <- as.numeric(gsub("\\$", "", data_apps$Price))
# Drop rows whose Price could not be parsed. The original also compared the
# (now numeric) column against "", which can never match after coercion, and
# computed two unused helper vectors; both removed.
data_apps <- data_apps[!is.na(data_apps$Price), ]
## Size ----
#### Impute missing Size values with the per-category mean
# "Varies with device" carries no numeric information, so mark it as NA.
data_apps$Size[data_apps$Size == "Varies with device"] <- NA
# Drop malformed rows where Size contains a "+" (shifted/dirty records).
data_apps <- data_apps[!grepl("\\+", data_apps$Size), ]
# Normalise everything to megabytes: "k" values are divided by 1000,
# "M" values simply have the suffix stripped.
has_kb <- grepl("k", data_apps$Size)
data_apps$Size <- ifelse(has_kb,
                         as.numeric(gsub("k", "", data_apps$Size)) * 0.001,
                         as.numeric(gsub("M", "", data_apps$Size)))
# Mean size per app Category (note: grouped by Category, not by the
# Type column as the earlier comment claimed).
mean_size_by_category <- tapply(data_apps$Size, data_apps$Category,
                                mean, na.rm = TRUE)
# Replace each missing Size with the rounded mean of its Category;
# non-missing sizes are kept unchanged.
size_is_na <- is.na(data_apps$Size)
data_apps$Size <- ifelse(size_is_na,
                         round(mean_size_by_category[data_apps$Category], 1),
                         data_apps$Size)
##Installs---
####Remove the '+' sign, Remove the commas, Convert to numeric
#clean installations
# Convert a raw Installs string such as "10,000+" into a plain number:
# strip the trailing "+" and the thousands separators, then coerce.
# Returns NA (with a warning) for values that are not numeric after cleaning.
clean_installs <- function(Installs) {
  cleaned <- gsub("[+,]", "", Installs)
  as.numeric(cleaned)
}
# clean_installs() is fully vectorised (gsub/as.numeric operate on whole
# vectors), so call it directly instead of element-wise via sapply().
data_apps$Installs <- clean_installs(data_apps$Installs)
# Sanity check: flag whether Size or Installs contain NaN after cleaning.
# vapply() instead of sapply() guarantees a logical vector result.
nan_rows <- vapply(data_apps[, c("Size", "Installs")],
                   function(x) any(is.nan(x)),
                   logical(1))
## Rating ----
# Impute missing ratings with the overall mean rating.
data_apps <- data_apps %>%
  mutate(Rating = ifelse(is.na(Rating), mean(Rating, na.rm = TRUE), Rating))
# Identify the unique values in the 'Installs' column.
unique_values <- unique(data_apps$Installs)
# Convert an Installs label (e.g. "10,000+") to a number for sorting.
#
# The original multiplied by 10^(number of commas - 1), which inflated any
# value containing two or more commas ("1,000,000" became 1e7) and was a
# no-op otherwise; stripping the non-digit characters alone already yields
# the correct value. Non-numeric inputs become NA (with a warning).
convert_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}
# Sort the unique Installs values by their numeric magnitude.
# vapply() guarantees a numeric(1) result per element, unlike sapply().
sorted_values <- unique_values[order(vapply(unique_values, convert_to_numeric,
                                            numeric(1)))]
# Reviews ----
# Coerce Reviews to numeric (non-numeric entries become NA with a warning).
data_apps$Reviews <- as.numeric(data_apps$Reviews)
# NOTE: the original repeated the Rating mean-imputation here; it was a
# no-op because Rating NAs were already imputed above, so it was removed.
# Content rating ----
# Encode Content.Rating as an integer code: convert to a factor first, then
# take the underlying level index as a numeric value.
data_apps$Content.Rating <- as.factor(data_apps$Content.Rating)
data_apps$Content.Rating <- as.numeric(data_apps$Content.Rating)
#### Preprocessing for a model ----
# Categories ----
# One-hot encode Category with no intercept column, rename the dummy
# columns from "CategoryX" to "catX", append them, and drop the original.
dummy_mat <- model.matrix(~ Category - 1, data = data_apps)
colnames(dummy_mat) <- gsub("Category", "cat", colnames(dummy_mat))
data_apps <- cbind(data_apps, dummy_mat)
data_apps$Category <- NULL
# Normalise column names: spaces become underscores.
colnames(data_apps) <- gsub(" ", "_", colnames(data_apps))
#### Installs ----
# Binarise the target: apps at or below the median install count are
# "Low Installs" (label 0), the rest "High Installs" (label 1).
median_installs <- median(data_apps$Installs, na.rm = TRUE)
low_mask <- data_apps$Installs <= median_installs
data_apps$Installs_Category <- ifelse(low_mask, "Low Installs", "High Installs")
# Fix the level order and relabel the classes as 0/1.
data_apps$Installs_Category <- factor(data_apps$Installs_Category,
                                      levels = c("Low Installs", "High Installs"),
                                      labels = c(0, 1))
# Inspect the class balance after the conversion.
table(data_apps$Installs_Category)
##
## 0 1
## 5890 4950
# Bar chart of the two install categories (geom_bar counts by default,
# so stat = "count" does not need to be spelled out).
ggplot(data_apps, aes(x = Installs_Category)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Histogram of Installs Category (Low vs High)",
    x = "Installs Category",
    y = "Count"
  ) +
  theme_minimal()
## Last updated ----
# Parse 'Last.Updated' (e.g. "January 7, 2018") into a Date, then express
# each value as the number of days behind the most recent update observed.
data_apps$Last.Updated <- as.Date(data_apps$Last.Updated, format = "%B %d, %Y")
latest_update <- max(data_apps$Last.Updated, na.rm = TRUE)
data_apps$Last.Updated <- as.numeric(difftime(latest_update,
                                              data_apps$Last.Updated,
                                              units = "days"))
# Scrollable summary table of the fully processed dataset.
summary(data_apps) %>%
  kable(caption = "Summary of Google Play Store Apps") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
| Rating | Reviews | Size | Installs | Price | Content.Rating | Last.Updated | catART_AND_DESIGN | catAUTO_AND_VEHICLES | catBEAUTY | catBOOKS_AND_REFERENCE | catBUSINESS | catCOMICS | catCOMMUNICATION | catDATING | catEDUCATION | catENTERTAINMENT | catEVENTS | catFAMILY | catFINANCE | catFOOD_AND_DRINK | catGAME | catHEALTH_AND_FITNESS | catHOUSE_AND_HOME | catLIBRARIES_AND_DEMO | catLIFESTYLE | catMAPS_AND_NAVIGATION | catMEDICAL | catNEWS_AND_MAGAZINES | catPARENTING | catPERSONALIZATION | catPHOTOGRAPHY | catPRODUCTIVITY | catSHOPPING | catSOCIAL | catSPORTS | catTOOLS | catTRAVEL_AND_LOCAL | catVIDEO_PLAYERS | catWEATHER | Installs_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :1.000 | Min. : 0 | Min. : 0.0085 | Min. :0.000e+00 | Min. : 0.000 | Min. :1.000 | Min. : 0.0 | Min. :0.000000 | Min. :0.000000 | Min. :0.000000 | Min. :0.00000 | Min. :0.00000 | Min. :0.000000 | Min. :0.0000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.000000 | Min. :0.0000 | Min. :0.00000 | Min. :0.00000 | Min. :0.0000 | Min. :0.00000 | Min. :0.000000 | Min. :0.000000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.000000 | Min. :0.00000 | Min. :0.0000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :0.0000 | Min. :0.00000 | Min. :0.000000 | 0:5890 | |
| 1st Qu.:4.100 | 1st Qu.: 38 | 1st Qu.: 5.9000 | 1st Qu.:1.000e+03 | 1st Qu.: 0.000 | 1st Qu.:2.000 | 1st Qu.: 19.0 | 1st Qu.:0.000000 | 1st Qu.:0.000000 | 1st Qu.:0.000000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.000000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.00000 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.0000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1:4950 | |
| Median :4.200 | Median : 2094 | Median : 14.5000 | Median :1.000e+05 | Median : 0.000 | Median :2.000 | Median : 76.0 | Median :0.000000 | Median :0.000000 | Median :0.000000 | Median :0.00000 | Median :0.00000 | Median :0.000000 | Median :0.0000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.000000 | Median :0.0000 | Median :0.00000 | Median :0.00000 | Median :0.0000 | Median :0.00000 | Median :0.000000 | Median :0.000000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.000000 | Median :0.00000 | Median :0.0000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :0.0000 | Median :0.00000 | Median :0.000000 | NA | |
| Mean :4.192 | Mean : 444153 | Mean : 21.2119 | Mean :1.546e+07 | Mean : 1.027 | Mean :2.465 | Mean : 259.7 | Mean :0.005996 | Mean :0.007841 | Mean :0.004889 | Mean :0.02131 | Mean :0.04244 | Mean :0.005535 | Mean :0.0357 | Mean :0.02159 | Mean :0.01439 | Mean :0.01375 | Mean :0.005904 | Mean :0.1819 | Mean :0.03376 | Mean :0.01172 | Mean :0.1055 | Mean :0.03146 | Mean :0.008118 | Mean :0.007841 | Mean :0.03524 | Mean :0.01264 | Mean :0.04271 | Mean :0.02611 | Mean :0.005535 | Mean :0.03616 | Mean :0.0309 | Mean :0.03911 | Mean :0.02399 | Mean :0.02721 | Mean :0.03542 | Mean :0.07777 | Mean :0.0238 | Mean :0.01614 | Mean :0.007565 | NA | |
| 3rd Qu.:4.500 | 3rd Qu.: 54776 | 3rd Qu.: 28.0000 | 3rd Qu.:5.000e+06 | 3rd Qu.: 0.000 | 3rd Qu.:2.000 | 3rd Qu.: 322.0 | 3rd Qu.:0.000000 | 3rd Qu.:0.000000 | 3rd Qu.:0.000000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.000000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.00000 | 3rd Qu.:0.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.0000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | NA | |
| Max. :5.000 | Max. :78158306 | Max. :100.0000 | Max. :1.000e+09 | Max. :400.000 | Max. :6.000 | Max. :3001.0 | Max. :1.000000 | Max. :1.000000 | Max. :1.000000 | Max. :1.00000 | Max. :1.00000 | Max. :1.000000 | Max. :1.0000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.000000 | Max. :1.0000 | Max. :1.00000 | Max. :1.00000 | Max. :1.0000 | Max. :1.00000 | Max. :1.000000 | Max. :1.000000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.000000 | Max. :1.00000 | Max. :1.0000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.00000 | Max. :1.0000 | Max. :1.00000 | Max. :1.000000 | NA |
# Installs_Category was already created as a factor above, so the original
# as.factor() call here was a no-op and has been dropped.
# Remove the raw Installs column so the numeric target cannot leak into
# the classification models.
data_apps$Installs <- NULL
We first start with the basic Logistic model.
# Fit a full logistic regression: Installs_Category (0/1) against every
# remaining column.
# NOTE(review): the model is fit on the entire dataset (no train/test split
# here), so the later in-sample accuracy/AUC are optimistic.
# One coefficient (catWEATHER) is reported NA by summary() due to a
# singularity: the category dummies sum to 1 and collide with the intercept.
glm_model <- glm(Installs_Category ~ ., data = data_apps, family = binomial)
summary(glm_model)
##
## Call:
## glm(formula = Installs_Category ~ ., family = binomial, data = data_apps)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3820514 0.6211700 0.615 0.53852
## Rating -0.6803505 0.0847476 -8.028 9.91e-16 ***
## Reviews 0.0006374 0.0000192 33.202 < 2e-16 ***
## Size -0.0005744 0.0030275 -0.190 0.84953
## Price -4.7944227 0.2228852 -21.511 < 2e-16 ***
## Content.Rating -0.1635661 0.0680993 -2.402 0.01631 *
## Last.Updated -0.0006128 0.0001470 -4.169 3.06e-05 ***
## catART_AND_DESIGN -0.4006310 0.7466491 -0.537 0.59156
## catAUTO_AND_VEHICLES -0.3219066 0.6526739 -0.493 0.62186
## catBEAUTY 0.4362883 0.6833897 0.638 0.52320
## catBOOKS_AND_REFERENCE -0.1883103 0.6018541 -0.313 0.75437
## catBUSINESS -0.7296793 0.5517898 -1.322 0.18604
## catCOMICS -0.7802165 0.7833083 -0.996 0.31922
## catCOMMUNICATION -0.6947830 0.6337572 -1.096 0.27295
## catDATING -0.3163013 0.5833206 -0.542 0.58765
## catEDUCATION 0.9985535 0.6030813 1.656 0.09777 .
## catENTERTAINMENT 0.2250374 0.8222913 0.274 0.78434
## catEVENTS -1.6690612 0.8893331 -1.877 0.06055 .
## catFAMILY -0.3178247 0.5048063 -0.630 0.52896
## catFINANCE -1.6174598 0.5894926 -2.744 0.00607 **
## catFOOD_AND_DRINK -0.2992787 0.6568855 -0.456 0.64868
## catGAME -0.3539908 0.5335639 -0.663 0.50705
## catHEALTH_AND_FITNESS -0.1707530 0.5602267 -0.305 0.76052
## catHOUSE_AND_HOME 0.7087588 0.6078017 1.166 0.24357
## catLIBRARIES_AND_DEMO 0.1163682 0.6447283 0.180 0.85677
## catLIFESTYLE -0.4210295 0.5434017 -0.775 0.43846
## catMAPS_AND_NAVIGATION -1.0259721 0.7273633 -1.411 0.15838
## catMEDICAL -1.4230374 0.5865690 -2.426 0.01526 *
## catNEWS_AND_MAGAZINES -0.9500342 0.6150160 -1.545 0.12241
## catPARENTING 0.3630993 0.6559063 0.554 0.57986
## catPERSONALIZATION 0.1339716 0.5607857 0.239 0.81118
## catPHOTOGRAPHY 0.7174853 0.5586400 1.284 0.19902
## catPRODUCTIVITY -0.4885764 0.5612522 -0.871 0.38402
## catSHOPPING -0.0205549 0.5949104 -0.035 0.97244
## catSOCIAL -0.4726360 0.6194859 -0.763 0.44549
## catSPORTS -0.9393387 0.5765341 -1.629 0.10325
## catTOOLS -0.2909637 0.5195893 -0.560 0.57549
## catTRAVEL_AND_LOCAL -0.0263625 0.5833597 -0.045 0.96396
## catVIDEO_PLAYERS 0.2862331 0.5872173 0.487 0.62595
## catWEATHER NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14945.8 on 10839 degrees of freedom
## Residual deviance: 2803.6 on 10801 degrees of freedom
## AIC: 2881.6
##
## Number of Fisher Scoring iterations: 15
Next, stepwise selection is performed starting from the logistic model above. The resulting model has an AIC of 2850, lower than the full model’s 2881, indicating a better fit after accounting for model complexity.
# Bidirectional stepwise selection (AIC-driven) starting from the full
# logistic model; trace = 0 suppresses the per-step console output.
stepwise_model <- stats::step(glm_model, direction = "both", trace = 0)
summary(stepwise_model)
##
## Call:
## glm(formula = Installs_Category ~ Rating + Reviews + Price +
## Content.Rating + Last.Updated + catBUSINESS + catEDUCATION +
## catEVENTS + catFINANCE + catHOUSE_AND_HOME + catMAPS_AND_NAVIGATION +
## catMEDICAL + catNEWS_AND_MAGAZINES + catPHOTOGRAPHY + catSPORTS +
## catVIDEO_PLAYERS + catPERSONALIZATION + catBEAUTY + catPARENTING,
## family = binomial, data = data_apps)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.447e-02 3.582e-01 0.264 0.792003
## Rating -6.786e-01 8.303e-02 -8.172 3.03e-16 ***
## Reviews 6.376e-04 1.882e-05 33.874 < 2e-16 ***
## Price -4.798e+00 2.226e-01 -21.553 < 2e-16 ***
## Content.Rating -1.782e-01 5.885e-02 -3.028 0.002464 **
## Last.Updated -6.181e-04 1.406e-04 -4.395 1.11e-05 ***
## catBUSINESS -4.264e-01 2.615e-01 -1.630 0.103014
## catEDUCATION 1.301e+00 3.590e-01 3.624 0.000290 ***
## catEVENTS -1.368e+00 7.478e-01 -1.829 0.067352 .
## catFINANCE -1.319e+00 3.359e-01 -3.927 8.62e-05 ***
## catHOUSE_AND_HOME 1.013e+00 3.660e-01 2.767 0.005656 **
## catMAPS_AND_NAVIGATION -7.234e-01 5.415e-01 -1.336 0.181578
## catMEDICAL -1.124e+00 3.281e-01 -3.427 0.000610 ***
## catNEWS_AND_MAGAZINES -6.422e-01 3.757e-01 -1.710 0.087354 .
## catPHOTOGRAPHY 1.024e+00 2.778e-01 3.688 0.000226 ***
## catSPORTS -6.410e-01 3.113e-01 -2.059 0.039503 *
## catVIDEO_PLAYERS 5.932e-01 3.292e-01 1.802 0.071563 .
## catPERSONALIZATION 4.459e-01 2.820e-01 1.581 0.113860
## catBEAUTY 7.416e-01 4.795e-01 1.547 0.121909
## catPARENTING 6.601e-01 4.392e-01 1.503 0.132892
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14945.8 on 10839 degrees of freedom
## Residual deviance: 2810.1 on 10820 degrees of freedom
## AIC: 2850.1
##
## Number of Fisher Scoring iterations: 16
# Variable importance for the stepwise model. For glm objects, caret's
# varImp() reports the absolute z-statistic of each coefficient (the values
# printed below match |z| from the summary above).
glm_varImp <- varImp(stepwise_model, scale = FALSE)
print(glm_varImp)
## Overall
## Rating 8.172170
## Reviews 33.874116
## Price 21.552835
## Content.Rating 3.027785
## Last.Updated 4.395380
## catBUSINESS 1.630414
## catEDUCATION 3.624470
## catEVENTS 1.829315
## catFINANCE 3.926529
## catHOUSE_AND_HOME 2.767071
## catMAPS_AND_NAVIGATION 1.335912
## catMEDICAL 3.427031
## catNEWS_AND_MAGAZINES 1.709524
## catPHOTOGRAPHY 3.687768
## catSPORTS 2.058906
## catVIDEO_PLAYERS 1.801890
## catPERSONALIZATION 1.581080
## catBEAUTY 1.546812
## catPARENTING 1.502794
#install.packages('vip')
library(vip)
# Plot the top 10 important features with a custom color.
# NOTE(review): vip() already returns a ggplot bar chart; layering an extra
# geom_bar(stat = "identity") and coord_flip() on top may double-draw the
# bars or flip an already-flipped plot -- confirm the rendered output looks
# as intended.
vip_plot<-vip(stepwise_model, num_features = 10, bar = TRUE)
vip_plot +
# Set the custom color for bars
geom_bar(stat = "identity", fill = "#365b6d") +
# Reversing the order of features for better visual flow
coord_flip() +
# Improving labels and text size
theme_minimal() +
theme(
axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hides the legend if not needed
) +
ggtitle("Top 10 Important Features - GLM Model")
# In-sample predictions from the stepwise model: predicted probabilities
# thresholded at 0.5 to produce 0/1 class labels.
glm_probs <- predict(stepwise_model, data_apps, type = "response")
glm_pred <- as.numeric(glm_probs > 0.5)
# Confusion matrix of predicted vs. actual install categories.
test_cm <- confusionMatrix(factor(glm_pred), factor(data_apps$Installs_Category))
print(test_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5763 370
## 1 127 4580
##
## Accuracy : 0.9542
## 95% CI : (0.95, 0.958)
## No Information Rate : 0.5434
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9072
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9784
## Specificity : 0.9253
## Pos Pred Value : 0.9397
## Neg Pred Value : 0.9730
## Prevalence : 0.5434
## Detection Rate : 0.5316
## Detection Prevalence : 0.5658
## Balanced Accuracy : 0.9518
##
## 'Positive' Class : 0
##
# Extract the confusion-matrix counts as a table, then build a data frame
# of cells (Prediction, Reference, Freq) with a percentage column for the
# heatmap. The original assigned cm_df twice from the same underlying
# table; the dead first assignment has been removed.
cm_table <- as.table(test_cm)
cm_df <- as.data.frame(test_cm$table)
cm_df$Percentage <- cm_df$Freq / sum(cm_df$Freq) * 100
# Heatmap of the logistic model's confusion matrix; each tile is annotated
# with its share of all predictions and its raw count.
ggplot(cm_df, aes(x = Prediction, y = Reference)) +
  geom_tile(aes(fill = Percentage), color = "white") +
  geom_text(
    aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Freq)),
    size = 4,
    color = "black"
  ) +
  scale_fill_gradient2(
    low = "white",
    high = "#6c9286",
    midpoint = min(cm_df$Percentage)
  ) +
  labs(
    title = "Confusion Matrix Heatmap",
    subtitle = sprintf("Overall Accuracy: %.1f%%",
                       test_cm$overall["Accuracy"] * 100)
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  )
# Precision, recall and F1 for the positive class. Note that caret treats
# "0" as the positive class here (see 'Positive' Class in the confusion
# matrix output above), so these metrics describe the Low-Installs class.
precision <- test_cm$byClass["Pos Pred Value"] # Precision
recall <- test_cm$byClass["Sensitivity"] # Recall
f1_score <- test_cm$byClass["F1"] # F1 Score
cat("Precision: ", precision, "\n")
## Precision: 0.9396706
cat("Recall: ", recall, "\n")
## Recall: 0.978438
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.9586626
# Predict probabilities
# NOTE(review): these are in-sample predictions (the model was fit on the
# same data), so the ROC/AUC below overstates generalisation performance.
glm_probs <- predict(stepwise_model, data_apps, type = "response")
# Generate ROC curve from the true labels and predicted probabilities.
roc_curve <- roc(data_apps$Installs_Category, glm_probs)
# Plot ROC curve
plot(roc_curve, main = "ROC Curve")
# AUC (Area Under the Curve)
auc_value <- auc(roc_curve)
cat("AUC: ", auc_value, "\n")
## AUC: 0.9922682
# Split the data into features and target for the KNN model: the numeric
# predictors plus every one-hot category column.
features <- c("Rating", "Reviews", "Size", "Price", "Content.Rating", "Last.Updated",
grep("^cat", names(data_apps), value = TRUE))
X <- data_apps[, features]
y <- data_apps$Installs_Category
# 70/30 train/test split, stratified by the target via createDataPartition.
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
# Standardise features using statistics learned on the training set only
# (avoids leaking test-set information into the scaling).
preprocess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preprocess_params, X_train)
X_test_scaled <- predict(preprocess_params, X_test)
# Cross-validate KNN over odd neighbourhood sizes k = 1, 3, ..., 19 and
# record the 5-fold CV accuracy for each k.
# The trainControl object is loop-invariant, so it is built once here
# instead of being recreated on every iteration.
k_values <- seq(1, 20, by = 2)
train_control <- trainControl(method = "cv", number = 5)
cv_results <- lapply(k_values, function(k) {
  # NOTE(review): no set.seed() before each fit, so the fold assignment
  # (and hence the reported accuracies) is not reproducible across runs.
  knn_model <- train(x = X_train_scaled,
                     y = y_train,
                     method = "knn",
                     tuneGrid = data.frame(k = k),
                     trControl = train_control)
  knn_model$results$Accuracy
})
# Visualise cross-validation accuracy as a function of the neighbourhood
# size k.
k_accuracy_df <- data.frame(k = k_values, accuracy = unlist(cv_results))
ggplot(k_accuracy_df, aes(x = k, y = accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(
    title = "K Values vs Cross-Validation Accuracy",
    x = "Number of Neighbors (k)",
    y = "Accuracy"
  ) +
  theme_minimal()
As we can see, the optimal k is selected via cross-validation with accuracy as the metric; the best k achieves a CV accuracy of about 78%.
# Pick the k with the highest CV accuracy. cv_results is a list, so it is
# flattened with unlist() first -- calling which.max() directly on a list
# is fragile.
optimal_k <- k_values[which.max(unlist(cv_results))]
# Refit KNN on the full training set with the chosen k.
final_knn <- knn3(X_train_scaled, y_train, k = optimal_k)
# Predicted class-1 probabilities on the test set, thresholded at 0.5.
y_pred_prob <- predict(final_knn, X_test_scaled, type = "prob")
y_pred <- factor(ifelse(y_pred_prob[, 2] > 0.5, 1, 0), levels = c(0, 1))
y_test_numeric <- as.numeric(as.character(y_test))
# Confusion matrix and ROC/AUC on the held-out test set.
# (y_test_numeric is already numeric; the original's extra as.numeric()
# wrapper was redundant.)
conf_matrix <- confusionMatrix(y_pred, y_test)
roc_curve <- roc(y_test_numeric, y_pred_prob[, 2])
auc_score <- auc(roc_curve)
# Create performance summary table for the KNN model.
# NOTE(review): the values indexed by name keep their names attribute,
# which appears to become the data frame's row names -- presumably the
# source of the duplicated first column in the rendered table below;
# wrap the values in unname() if that is unwanted.
performance_summary <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1 Score", "AUC"),
Value = c(
conf_matrix$overall["Accuracy"],
conf_matrix$byClass["Precision"],
conf_matrix$byClass["Recall"],
conf_matrix$byClass["F1"],
auc_score
)
)
# Display performance summary using kable
kable(performance_summary, caption = "KNN Model Performance Summary") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value | |
|---|---|---|
| Accuracy | Accuracy | 0.7610701 |
| Precision | Precision | 0.7857968 |
| Recall | Recall | 0.7702320 |
| F1 | F1 Score | 0.7779366 |
| AUC | AUC | 0.8324551 |
# Confusion-matrix heatmap for the KNN model; each tile is annotated with
# its share of all predictions and its raw count.
conf_matrix_df <- as.data.frame(conf_matrix$table)
conf_matrix_df$Percentage <- conf_matrix_df$Freq / sum(conf_matrix_df$Freq) * 100
ggplot(conf_matrix_df, aes(x = Prediction, y = Reference)) +
  geom_tile(aes(fill = Percentage), color = "white") +
  geom_text(
    aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Freq)),
    size = 4,
    color = "black"
  ) +
  scale_fill_gradient2(
    low = "white",
    high = "#4A90E2",
    midpoint = min(conf_matrix_df$Percentage)
  ) +
  labs(
    title = "Confusion Matrix Heatmap",
    subtitle = sprintf("Overall Accuracy: %.1f%%",
                       conf_matrix$overall["Accuracy"] * 100)
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  )
# Plot ROC curve for the KNN model on the held-out test set, with the
# chance diagonal for reference and the AUC shown in the legend.
plot(roc_curve, main = "ROC Curve for KNN Model",
col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "gray")
legend("bottomright", legend = sprintf("AUC = %.3f", auc_score))
The AUC for the model is 0.83, which is good. However, performance metrics such as accuracy and precision were expected to exceed 80%, and they fall short of that mark here.
# Permutation importance for one feature: shuffle that feature's column in
# the scaled test set, re-score the fitted KNN, and report the drop in AUC
# relative to the unpermuted model (a larger drop = a more important
# feature).
# Relies on globals defined above: X_test_scaled, final_knn, y_test and
# auc_score.
# NOTE(review): sample() is not seeded here, so the importance scores vary
# between runs -- consider set.seed() before calling this for
# reproducibility.
calc_importance <- function(feature) {
X_test_permuted <- X_test_scaled
X_test_permuted[,feature] <- sample(X_test_scaled[,feature])
pred_permuted <- predict(final_knn, X_test_permuted, type = "prob")
roc_permuted <- roc(as.numeric(y_test), pred_permuted[,2])
return(auc_score - auc(roc_permuted))
}
# Permutation importance for every feature, as a guaranteed numeric vector
# (vapply instead of sapply for type stability).
importance_scores <- vapply(features, calc_importance, numeric(1))
importance_df <- data.frame(
  Feature = features,
  Importance = importance_scores
)
# Rank features from most to least important.
importance_df <- importance_df[order(-importance_df$Importance), ]
# Plot the ranked features. head(..., 39) is robust when fewer than 39
# features exist, unlike the original hard-coded importance_df[1:39, ],
# which would produce NA rows in that case.
ggplot(head(importance_df, 39), aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Feature Importance (KNN Model)",
       x = "Feature",
       y = "Importance Score") +
  theme_minimal()
The top 5 features contributing to the model are Size, Rating, Last updated, Content Rating and Reviews.
#————————————————————————-
# Remove the Installs and Installs numerical columns
# NOTE(review): Installs was already dropped earlier in the script, so this
# filter is a no-op safeguard at this point.
data <- data_apps[, !colnames(data_apps) %in% c("Installs")]
# Split the data into training and testing sets (80/20, stratified by the
# target via createDataPartition).
set.seed(123) # Ensure reproducibility
trainIndex <- createDataPartition(data$Installs_Category, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData<- data[-trainIndex, ]
# Fit the decision tree model.
# method = "class" requests a classification tree; rpart's default control
# parameters govern splitting and complexity-based pruning.
set.seed(42)
tree_model <- rpart(
Installs_Category ~ . ,
data = trainData,
method = "class"
)
# Plot the fitted decision tree.
rpart.plot(tree_model, main = "Decision Tree for Predicting Installs Category")
# Class predictions on the training and test partitions.
train_predictions <- predict(tree_model, trainData, type = "class")
test_predictions <- predict(tree_model, testData, type = "class")
# Accuracy expressed as the proportion of correct class predictions
# (mean of a logical vector).
train_accuracy <- mean(train_predictions == trainData$Installs_Category)
test_accuracy <- mean(test_predictions == testData$Installs_Category)
# Report both accuracies.
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy: 0.9519142
cat("Test Accuracy: ", test_accuracy, "\n")
## Test Accuracy: 0.9488007
Why shift to Random Forest? High Dimensionality: With 41 variables,
random forest handles many features better and can identify the most
important ones.
Feature Importance: Random forest provides a ranking
of feature importance, helping us understand which variables influence
the Installs_Category.
Accuracy: Random forest generally has better
predictive accuracy for larger and more complex datasets.
In this analysis, we employ a Random Forest model to predict the number of installs based on the top 5 app categories. The Random Forest algorithm is a robust ensemble learning method that builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.
# Train the random forest model: 500 trees, 4 candidate predictors sampled
# at each split; importance = TRUE stores both MeanDecreaseAccuracy and
# MeanDecreaseGini for the plots further below.
set.seed(123)
rf_model <- randomForest(Installs_Category ~ .,
data = trainData,
ntree = 500, # Number of trees
mtry = 4, # Number of predictors sampled at each split
importance = TRUE) # Enable importance calculation
# Print the model summary (includes the OOB error estimate).
print(rf_model)
##
## Call:
## randomForest(formula = Installs_Category ~ ., data = trainData, ntree = 500, mtry = 4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 4.35%
## Confusion matrix:
## 0 1 class.error
## 0 4547 165 0.03501698
## 1 212 3748 0.05353535
# Plot the Random Forest model: error rates as a function of the number of
# trees (one OOB curve plus one curve per class).
plot(rf_model, main = "Random Forest Model Performance")
# Add a legend to explain the colors.
# NOTE(review): the classes are labelled 0/1 in the data; "Class 1"/"Class 2"
# here refer to the first/second factor level -- confirm the colour-to-class
# mapping against the plotted curves.
legend("topright",
legend = c("OOB Error", "Class 1 Error", "Class 2 Error"),
col = c("black", "red", "green"),
lty = 1,
cex = 0.8)
# Predictions on the training set.
train_predictions <- predict(rf_model, trainData)
# Predictions on the testing set.
test_predictions <- predict(rf_model, testData)
# Confusion Matrix for Training Data.
# NOTE(review): predicting on the training data uses all trees (not the
# OOB votes), so this accuracy is optimistic versus the OOB estimate above.
train_cm <- confusionMatrix(train_predictions, trainData$Installs_Category)
print(train_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4617 122
## 1 95 3838
##
## Accuracy : 0.975
## 95% CI : (0.9715, 0.9782)
## No Information Rate : 0.5434
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9495
##
## Mcnemar's Test P-Value : 0.07756
##
## Sensitivity : 0.9798
## Specificity : 0.9692
## Pos Pred Value : 0.9743
## Neg Pred Value : 0.9758
## Prevalence : 0.5434
## Detection Rate : 0.5324
## Detection Prevalence : 0.5465
## Balanced Accuracy : 0.9745
##
## 'Positive' Class : 0
##
# Confusion Matrix for Testing Data: held-out performance, comparable to
# the OOB error estimate. (Note: this reuses the name test_cm from the
# logistic-regression section.)
test_cm <- confusionMatrix(test_predictions, testData$Installs_Category)
print(test_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1132 53
## 1 46 937
##
## Accuracy : 0.9543
## 95% CI : (0.9447, 0.9627)
## No Information Rate : 0.5434
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9079
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.9610
## Specificity : 0.9465
## Pos Pred Value : 0.9553
## Neg Pred Value : 0.9532
## Prevalence : 0.5434
## Detection Rate : 0.5221
## Detection Prevalence : 0.5466
## Balanced Accuracy : 0.9537
##
## 'Positive' Class : 0
##
# Recompute accuracies from raw confusion tables and compare with the
# out-of-bag (OOB) estimate. (The predictions duplicate those computed
# above; kept for the narrative flow of the report.)
# Performance on Training Data
train_predictions <- predict(rf_model, newdata = trainData)
train_conf_matrix <- table(train_predictions, trainData$Installs_Category)
train_accuracy <- sum(diag(train_conf_matrix)) / sum(train_conf_matrix)
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy: 0.9749769
# Performance on Testing Data
test_predictions <- predict(rf_model, newdata = testData)
test_conf_matrix <- table(test_predictions, testData$Installs_Category)
test_accuracy <- sum(diag(test_conf_matrix)) / sum(test_conf_matrix)
cat("Testing Accuracy: ", test_accuracy, "\n")
## Testing Accuracy: 0.9543358
# OOB Error from the model.
# Use the final row of err.rate rather than a hard-coded index of 500 so
# this keeps working if ntree is ever changed.
oob_error <- rf_model$err.rate[nrow(rf_model$err.rate), "OOB"]
oob_accuracy <- 1 - oob_error
cat("OOB Accuracy: ", oob_accuracy, "\n")
## OOB Accuracy: 0.9565268
# Compare Results: training accuracy exceeds both the testing and OOB
# accuracies, indicating mild overfitting, while OOB tracks the held-out
# estimate closely.
comparison <- data.frame(
Dataset = c("Training", "Testing", "OOB"),
Accuracy = c(train_accuracy, test_accuracy, oob_accuracy)
)
print(comparison)
## Dataset Accuracy
## 1 Training 0.9749769
## 2 Testing 0.9543358
## 3 OOB 0.9565268
# pROC is already attached via library(pROC) at the top of the script; this
# guard only installs it when absent. requireNamespace() is the recommended
# availability check -- require() both loads and returns a flag, and
# installing packages mid-script is best avoided anyway.
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC", dependencies = TRUE)
# Get predicted probabilities for the positive class (class '1')
rf_prob <- predict(rf_model, testData, type = "prob")[, 2] # Probabilities for class '1'
# Compute ROC curve and AUC on the held-out test set
roc_curve <- roc(testData$Installs_Category, rf_prob)
# Plot the ROC curve with the chance diagonal for reference
plot(roc_curve, col = "blue", lwd = 2, main = "ROC Curve for Random Forest")
abline(a = 0, b = 1, lty = 2, col = "gray") # Diagonal reference line
# Display the AUC value
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
## AUC: 0.9881922
# Variable importance: per-class and overall permutation importance
# (MeanDecreaseAccuracy) plus node-impurity importance (MeanDecreaseGini)
# from the fitted forest.
importance(rf_model)
## 0 1 MeanDecreaseAccuracy
## Rating 15.1142447 19.94723199 22.5997876
## Reviews 90.0067853 100.26035812 101.9665752
## Size 11.8994820 17.27289181 20.2294097
## Price 37.1396369 41.59484288 45.7099624
## Content.Rating 10.1721956 10.05626857 13.8741454
## Last.Updated 17.5212482 16.76819395 24.0306055
## catART_AND_DESIGN -1.7119601 -0.45450740 -1.6004086
## catAUTO_AND_VEHICLES 2.1287644 7.30700185 7.0864851
## catBEAUTY -1.6937161 4.43340758 2.3915006
## catBOOKS_AND_REFERENCE -1.4109002 2.40811737 0.9254431
## catBUSINESS -2.9786640 11.46650484 10.2307574
## catCOMICS 2.6287264 2.40038960 3.7801295
## catCOMMUNICATION 0.3875751 -0.94118112 -0.3435919
## catDATING 4.8477990 -0.39074922 3.4658472
## catEDUCATION 12.0764045 7.85370158 12.9867373
## catENTERTAINMENT 14.4382698 1.32379128 14.7495594
## catEVENTS 6.6402383 10.65844758 11.2020113
## catFAMILY -5.9215812 6.69886354 2.4785553
## catFINANCE 0.8756648 5.90079226 5.1381823
## catFOOD_AND_DRINK -4.4615018 0.43503127 -3.1082568
## catGAME 12.3678276 0.23835409 12.5083958
## catHEALTH_AND_FITNESS 1.4680166 3.83750346 3.7565491
## catHOUSE_AND_HOME -1.2091642 2.71761141 1.1205213
## catLIBRARIES_AND_DEMO 1.0719952 7.41387595 7.0258040
## catLIFESTYLE -1.7178046 6.10759331 5.2129387
## catMAPS_AND_NAVIGATION -3.7395597 1.04149969 -1.2763125
## catMEDICAL 6.6464141 20.04470310 20.3154344
## catNEWS_AND_MAGAZINES 1.5084049 6.83054946 6.2625470
## catPARENTING -1.5158336 4.68584942 2.3270685
## catPERSONALIZATION -0.1349033 3.41111809 2.6745840
## catPHOTOGRAPHY 14.3055236 5.00877316 15.4154186
## catPRODUCTIVITY -4.6589723 1.97159358 -1.0439910
## catSHOPPING 7.9015188 1.67789040 8.5681436
## catSOCIAL 3.0757349 -0.93364078 2.3564318
## catSPORTS 1.6490382 0.37976680 1.3831812
## catTOOLS -3.8679728 0.19000491 -1.6527781
## catTRAVEL_AND_LOCAL -2.8476202 0.61812514 -1.7685359
## catVIDEO_PLAYERS -0.7382980 -0.01421662 -0.5999692
## catWEATHER 3.2343068 -2.54911142 1.2265776
## MeanDecreaseGini
## Rating 258.205076
## Reviews 2341.076881
## Size 202.836810
## Price 105.230783
## Content.Rating 32.068778
## Last.Updated 276.565693
## catART_AND_DESIGN 2.632787
## catAUTO_AND_VEHICLES 4.148712
## catBEAUTY 2.755840
## catBOOKS_AND_REFERENCE 4.978577
## catBUSINESS 12.245818
## catCOMICS 2.994492
## catCOMMUNICATION 4.271465
## catDATING 5.816032
## catEDUCATION 9.973986
## catENTERTAINMENT 13.860234
## catEVENTS 4.870598
## catFAMILY 13.905954
## catFINANCE 5.811543
## catFOOD_AND_DRINK 2.055011
## catGAME 40.202925
## catHEALTH_AND_FITNESS 5.479474
## catHOUSE_AND_HOME 2.958173
## catLIBRARIES_AND_DEMO 4.038455
## catLIFESTYLE 6.265143
## catMAPS_AND_NAVIGATION 1.791088
## catMEDICAL 43.762990
## catNEWS_AND_MAGAZINES 4.494912
## catPARENTING 2.847382
## catPERSONALIZATION 4.814103
## catPHOTOGRAPHY 18.729702
## catPRODUCTIVITY 4.511257
## catSHOPPING 7.480957
## catSOCIAL 4.087498
## catSPORTS 3.295354
## catTOOLS 7.597817
## catTRAVEL_AND_LOCAL 2.957056
## catVIDEO_PLAYERS 3.473253
## catWEATHER 3.001699
# Plot variable importance (randomForest's built-in dotchart pair).
varImpPlot(rf_model)
#### Visualization for Feature Importance
# Extract importance values from the fitted forest.
importance_values <- importance(rf_model)
# Reshape the two headline metrics into a tidy data frame for ggplot.
importance_df <- data.frame(
Feature = rownames(importance_values),
MeanDecreaseAccuracy = importance_values[, "MeanDecreaseAccuracy"],
MeanDecreaseGini = importance_values[, "MeanDecreaseGini"]
)
# Horizontal bar chart of permutation importance (mean decrease in
# accuracy), features ordered by importance.
accuracy_plot <- ggplot(
  importance_df,
  aes(x = reorder(Feature, MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)
) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Feature Importance (Mean Decrease in Accuracy)",
       x = "Features",
       y = "Importance") +
  theme_minimal() +
  theme(text = element_text(size = 12),
        axis.text.y = element_text(size = 10))
# Render the plot.
print(accuracy_plot)
# Optionally save at larger dimensions:
# ggsave("feature_importance_accuracy_large.png", plot = accuracy_plot, width = 12, height = 10, dpi = 300)
# Horizontal bar chart of impurity importance (mean decrease in Gini),
# features ordered by importance.
gini_plot <- ggplot(
  importance_df,
  aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)
) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  coord_flip() +
  labs(title = "Feature Importance (Mean Decrease in Gini)",
       x = "Features",
       y = "Importance") +
  theme_minimal() +
  theme(text = element_text(size = 12),
        axis.text.y = element_text(size = 10))
# Render the plot.
print(gini_plot)
# Optionally save at larger dimensions:
# ggsave("feature_importance_gini_large.png", plot = gini_plot, width = 12, height = 10, dpi = 300)
# Separate features (X) and target (y) for the XGBoost model.
#X <- data_final %>% select(-Installs, -Success)
X <- data_apps %>% select(-Installs_Category) # Exclude the target variable
y <- data_apps$Installs_Category # Extract the target variable
# Class balance of the binary target.
table(y)
## y
## 0 1
## 5890 4950
#Split into training and testing sets
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)
# Define X_train, X_test, y_train, y_test
X_train <- X[train_index, ] %>% mutate(across(everything(), as.numeric))
X_test <- X[-train_index, ] %>% mutate(across(everything(), as.numeric))
y_train <- as.numeric(as.character(y[train_index])) # Convert to numeric
y_test <- as.numeric(as.character(y[-train_index])) # Convert to numeric
# Convert data to matrix for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)
data_apps$Installs_Category <- factor(data_apps$Installs_Category, levels = c(0, 1))
# 4. --------------------------------------
# XGBoost hyper-parameters for the binary classifier
xgb_params <- list(
  objective = "binary:logistic", # Binary classification
  eval_metric = "logloss",
  max_depth = 6,          # maximum tree depth
  eta = 0.1,              # learning rate
  subsample = 0.8,        # row sampling per boosting round
  colsample_bytree = 0.8  # column sampling per tree
)
# Train with early stopping on the test-set log-loss
set.seed(42)
xgb_model <- xgb.train(
  params = xgb_params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, test = dtest),
  early_stopping_rounds = 10, # stop after 10 rounds without improvement
  verbose = 1
)
## [1] train-logloss:0.609051 test-logloss:0.610845
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
##
## [2] train-logloss:0.540069 test-logloss:0.543209
## [3] train-logloss:0.482265 test-logloss:0.486837
## [4] train-logloss:0.434053 test-logloss:0.440368
## [5] train-logloss:0.392603 test-logloss:0.399988
## [6] train-logloss:0.357633 test-logloss:0.366472
## [7] train-logloss:0.326262 test-logloss:0.335497
## [8] train-logloss:0.299307 test-logloss:0.309752
## [9] train-logloss:0.275127 test-logloss:0.286471
## [10] train-logloss:0.254064 test-logloss:0.266038
## [11] train-logloss:0.246028 test-logloss:0.258059
## [12] train-logloss:0.228166 test-logloss:0.241110
## [13] train-logloss:0.212535 test-logloss:0.226162
## [14] train-logloss:0.199030 test-logloss:0.213673
## [15] train-logloss:0.186369 test-logloss:0.202031
## [16] train-logloss:0.175329 test-logloss:0.191756
## [17] train-logloss:0.165163 test-logloss:0.182054
## [18] train-logloss:0.155909 test-logloss:0.173148
## [19] train-logloss:0.147937 test-logloss:0.165724
## [20] train-logloss:0.140798 test-logloss:0.159106
## [21] train-logloss:0.134419 test-logloss:0.153387
## [22] train-logloss:0.128485 test-logloss:0.147681
## [23] train-logloss:0.126779 test-logloss:0.146159
## [24] train-logloss:0.121702 test-logloss:0.141489
## [25] train-logloss:0.120300 test-logloss:0.140252
## [26] train-logloss:0.115684 test-logloss:0.136209
## [27] train-logloss:0.111563 test-logloss:0.132826
## [28] train-logloss:0.107759 test-logloss:0.129496
## [29] train-logloss:0.104358 test-logloss:0.126577
## [30] train-logloss:0.101191 test-logloss:0.124088
## [31] train-logloss:0.098429 test-logloss:0.121812
## [32] train-logloss:0.096048 test-logloss:0.119849
## [33] train-logloss:0.093736 test-logloss:0.118078
## [34] train-logloss:0.091855 test-logloss:0.116222
## [35] train-logloss:0.089978 test-logloss:0.114691
## [36] train-logloss:0.088334 test-logloss:0.113315
## [37] train-logloss:0.086624 test-logloss:0.111863
## [38] train-logloss:0.085822 test-logloss:0.111504
## [39] train-logloss:0.084222 test-logloss:0.110214
## [40] train-logloss:0.083036 test-logloss:0.109247
## [41] train-logloss:0.081971 test-logloss:0.108476
## [42] train-logloss:0.080807 test-logloss:0.107580
## [43] train-logloss:0.079924 test-logloss:0.106761
## [44] train-logloss:0.078889 test-logloss:0.106185
## [45] train-logloss:0.077890 test-logloss:0.105556
## [46] train-logloss:0.077019 test-logloss:0.104972
## [47] train-logloss:0.076692 test-logloss:0.104888
## [48] train-logloss:0.076087 test-logloss:0.104560
## [49] train-logloss:0.075228 test-logloss:0.103823
## [50] train-logloss:0.074563 test-logloss:0.103500
## [51] train-logloss:0.073904 test-logloss:0.103389
## [52] train-logloss:0.073342 test-logloss:0.103047
## [53] train-logloss:0.072766 test-logloss:0.102632
## [54] train-logloss:0.072190 test-logloss:0.102346
## [55] train-logloss:0.071809 test-logloss:0.102039
## [56] train-logloss:0.071340 test-logloss:0.101941
## [57] train-logloss:0.070922 test-logloss:0.101906
## [58] train-logloss:0.070537 test-logloss:0.101871
## [59] train-logloss:0.070109 test-logloss:0.101728
## [60] train-logloss:0.069795 test-logloss:0.101409
## [61] train-logloss:0.069435 test-logloss:0.101432
## [62] train-logloss:0.069234 test-logloss:0.101196
## [63] train-logloss:0.068961 test-logloss:0.100983
## [64] train-logloss:0.068728 test-logloss:0.101066
## [65] train-logloss:0.068271 test-logloss:0.101246
## [66] train-logloss:0.067911 test-logloss:0.101329
## [67] train-logloss:0.067658 test-logloss:0.101267
## [68] train-logloss:0.067465 test-logloss:0.101118
## [69] train-logloss:0.066696 test-logloss:0.101152
## [70] train-logloss:0.066292 test-logloss:0.101252
## [71] train-logloss:0.066050 test-logloss:0.101360
## [72] train-logloss:0.065654 test-logloss:0.101406
## [73] train-logloss:0.065511 test-logloss:0.101405
## Stopping. Best iteration:
## [63] train-logloss:0.068961 test-logloss:0.100983
#
# Make predictions: predicted probabilities on the held-out set
y_pred <- predict(xgb_model, dtest)
# Threshold probabilities at 0.5 into class labels
y_pred_class <- as.numeric(y_pred > 0.5)
# Confusion Matrix: coerce both prediction and truth to factors with
# matching levels so caret::confusionMatrix aligns the classes
y_pred_class <- factor(y_pred_class, levels = c(0, 1))
y_test <- factor(y_test, levels = c(0, 1))
conf_matrix <- confusionMatrix(y_pred_class, y_test)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1698 67
## 1 69 1418
##
## Accuracy : 0.9582
## 95% CI : (0.9507, 0.9648)
## No Information Rate : 0.5434
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9157
##
## Mcnemar's Test P-Value : 0.9317
##
## Sensitivity : 0.9610
## Specificity : 0.9549
## Pos Pred Value : 0.9620
## Neg Pred Value : 0.9536
## Prevalence : 0.5434
## Detection Rate : 0.5221
## Detection Prevalence : 0.5427
## Balanced Accuracy : 0.9579
##
## 'Positive' Class : 0
##
Confusion Matrix: The model separates “Low Installs” from “High Installs” very well, misclassifying only 67 + 69 = 136 of the 3,252 test apps, for an overall accuracy of about 95.8%. Precision and recall are correspondingly high for both classes.
# AUC and ROC Curve: build the ROC from numeric labels and predicted probabilities
roc_obj <- roc(response = as.numeric(as.character(y_test)), predictor = y_pred)
auc_value <- auc(roc_obj)
cat("AUC:", auc_value, "\n")
## AUC: 0.993891
# Plot ROC Curve with the chance diagonal for reference
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve")
abline(a = 0, b = 1, lty = 2, col = "red") # random-classifier baseline
The ROC curve illustrates the model’s performance by plotting Sensitivity (True Positive Rate) against 1 - Specificity (False Positive Rate) across thresholds. The closer the curve approaches the top-left corner, the better the model distinguishes between “Low Installs” and “High Installs.” A high AUC value indicates strong classification performance.
# Rank predictors by XGBoost's importance measures and plot the top 10
importance_matrix <- xgb.importance(feature_names = colnames(X_train), model = xgb_model)
xgb.plot.importance(importance_matrix, top_n = 10, main = "Feature Importance")
# 7. Save Model ---------------------------------------------------------
xgb.save(xgb_model, "xgb_app_success.model")
## [1] TRUE
# Summary of held-out performance
cat("Gradient Boosting achieved an accuracy of", conf_matrix$overall["Accuracy"],
"and AUC of", auc_value, "\n")
## Gradient Boosting achieved an accuracy of 0.9581796 and AUC of 0.993891
“Reviews” has the highest influence, followed by “Rating” and “Last.Updated.” Features like “Price” and “Size” also contribute, albeit minimally. Other categorical variables, such as “catFAMILY” and “catMEDICAL,” have negligible impact on the model’s predictions.
# Convert target variable to a factor
# Split the data into features (X) and target (y)
y <- as.factor(data_apps$Installs_Category)
X <- data_apps[, setdiff(names(data_apps), "Installs_Category")]
# Split data into training and testing sets (75/25, stratified on y)
set.seed(42)
part_idx <- createDataPartition(y, p = 0.75, list = FALSE)
X_train <- X[part_idx, ]
X_test <- X[-part_idx, ]
y_train <- y[part_idx]
y_test <- y[-part_idx]
library(plotly)
# Prepare the plot data
plot_data <- data.frame(X_train, Class = as.factor(y_train))
# Helper: 3D scatter of three training-set features, coloured by class.
# `idx` is an integer vector of three column indices into X_train.
plot_features_3d <- function(idx) {
  axis_titles <- colnames(X_train)[idx]
  plot_ly(data = plot_data,
          x = ~X_train[, idx[1]],
          y = ~X_train[, idx[2]],
          z = ~X_train[, idx[3]],
          color = ~Class,
          colors = c("red", "blue"), # Set colors for classes (0 and 1)
          type = 'scatter3d',
          mode = 'markers') %>%
    layout(title = paste0("3D Scatter Plot: Feature ", idx[1],
                          " vs Feature ", idx[2],
                          " vs Feature ", idx[3]),
           scene = list(xaxis = list(title = axis_titles[1]),
                        yaxis = list(title = axis_titles[2]),
                        zaxis = list(title = axis_titles[3])))
}
# Create 3D scatter plots for features 1-3 and 4-6
plot_1_2_3 <- plot_features_3d(1:3)
plot_4_5_6 <- plot_features_3d(4:6)
# Show plots
plot_1_2_3
plot_4_5_6
As we can see, we cannot decide whether the decision boundary is linear or non-linear; hence, let us build two SVM models — linear and non-linear — to check which one is the better fit.
# Load necessary libraries
library(e1071)
library(caret)
# Combine the training features and labels into one data frame
train_data <- as.data.frame(cbind(X_train, y_train))
# Set up 5-fold cross-validation
set.seed(42)
cv_control <- trainControl(method = "cv", number = 5)
# Tuning grid over the cost 'C' and RBF width 'sigma' (gamma)
rbf_grid <- expand.grid(C = c(0.1, 1, 10, 100),
                        sigma = c(0.5, 1))
# Train the SVM using a radial (RBF) kernel across the tuning grid
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmRadial",
                   tuneGrid = rbf_grid,
                   trControl = cv_control,
                   scaled = TRUE)
# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel
##
## 8131 samples
## 39 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 0.1 0.5 0.7152887 0.4135606
## 0.1 1.0 0.7103702 0.3989988
## 1.0 0.5 0.7653439 0.5245875
## 1.0 1.0 0.7691565 0.5316017
## 10.0 0.5 0.7864964 0.5679965
## 10.0 1.0 0.7775180 0.5503263
## 100.0 0.5 0.7925222 0.5808094
## 100.0 1.0 0.7809624 0.5579036
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5 and C = 100.
# Best model parameters selected by cross-validated accuracy
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
## sigma C
## 7 0.5 100
As seen for the training set the best accuracy is achieved when C = 100 and gamma is 0.5
# Load necessary libraries
library(e1071)
library(caret)
# Combine the training features and labels into one data frame
train_data <- as.data.frame(cbind(X_train, y_train))
# Set up 5-fold cross-validation
set.seed(42)
cv_control <- trainControl(method = "cv", number = 5)
# Tuning grid over cost 'C' only (the linear kernel has no sigma)
linear_grid <- expand.grid(C = c(0.1, 1, 10, 100, 1000))
# Train the SVM using a linear kernel across the tuning grid
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmLinear",
                   tuneGrid = linear_grid,
                   trControl = cv_control,
                   scaled = TRUE)
# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Linear Kernel
##
## 8131 samples
## 39 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1e-01 0.7422242 0.4695042
## 1e+00 0.7841613 0.5544567
## 1e+01 0.8463913 0.6827210
## 1e+02 0.9097293 0.8154933
## 1e+03 0.9484710 0.8954868
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1000.
# Best model parameters selected by cross-validated accuracy
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
## C
## 5 1000
For the linear model it can be seen that at C = 1000 we attain an accuracy of about 94.8 percent, which suggests that it is reasonable to assume the classes are linearly separable. Hence, let us now find the accuracy, ROC, and AUC score on the test data.
# Load necessary libraries
library(e1071)
library(ggplot2)
# Combine the training set into a data frame; keep the test features separate
train_data <- as.data.frame(cbind(X_train, y_train))
test_data <- as.data.frame(X_test)
# Candidate cost values to compare by test-set MSE
C_values <- c(0.1, 1, 10, 100, 1000, 100000, 500000)
# Ground truth as numeric 0/1 for the MSE computation (hoisted out of the loop)
y_test_numeric <- as.numeric(as.character(y_test))
# For each C: fit a linear SVM, predict on the test set, return the MSE.
# vapply preallocates the result vector instead of growing it with c().
mse_values <- vapply(C_values, function(C) {
  svm_fit <- svm(y_train ~ ., data = train_data, kernel = "linear",
                 cost = C, scale = TRUE)
  preds <- as.numeric(as.character(predict(svm_fit, newdata = test_data)))
  mean((y_test_numeric - preds)^2)
}, numeric(1))
# Combine results into a data frame for plotting
results <- data.frame(
  C = C_values,
  MSE = mse_values
)
# Plotting MSE values. With a discrete x-axis, geom_line needs an explicit
# group aesthetic (group = 1) or each point is its own group and no line is
# drawn — this fixes the original "each group consists of only one
# observation" problem.
ggplot(results, aes(x = factor(C), y = MSE, group = 1)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 3) +
  labs(title = "MSE for Different C Values",
       x = "C Values",
       y = "Mean Squared Error (MSE)") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  scale_x_discrete(labels = function(x) paste0("C = ", x)) +
  geom_text(aes(label = round(MSE, 2)), vjust = -0.5) # Add value labels above points
The analysis of bias, variance, and mean squared error (MSE) across
different values of C shows: MSE dip: at C = 1000 the MSE is at its
minimum. Low bias and variance: the variance is slightly higher, which
is expected for large C, but it is not much higher than at C = 10 or
C = 100.
Hence, better to select C = 1000 for the SVM model to achieve an optimal balance between bias, variance, and MSE.
# Load necessary libraries
library(e1071)
library(pROC) # For ROC and AUC
# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))
# Fit the final linear-kernel SVM at the tuned cost C = 1000.
# NOTE: e1071::svm's argument is `scale` (the original `scaled = TRUE` was
# silently absorbed by `...` and ignored; scaling defaults to TRUE anyway,
# so behavior is unchanged — this just makes the intent explicit).
svm_model <- svm(y_train ~ ., data = train_data, kernel = "linear", cost = 1000,
                 decision.values = TRUE, scale = TRUE)
# Step 1: Make predictions on the test set
predictions <- predict(svm_model, newdata = as.data.frame(X_test))
# Step 2: Create confusion matrix (rows = predicted, columns = actual)
confusion_matrix <- table(Predicted = predictions, Actual = y_test)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
## Actual
## Predicted 0 1
## 0 1441 114
## 1 31 1123
confusion_matrix_caret <- confusionMatrix(confusion_matrix)
# Step 3: Extract Precision, Recall, and F1 Score (reported for the
# positive class, which caret takes as the first level, "0")
precision <- confusion_matrix_caret$byClass['Precision']
recall <- confusion_matrix_caret$byClass['Recall']
f1_score <- confusion_matrix_caret$byClass['F1']
# Display the metrics
cat("Precision:", precision, "\n")
## Precision: 0.9266881
cat("Recall:", recall, "\n")
## Recall: 0.9789402
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.9520978
# Overall accuracy = correctly classified (diagonal) / total predictions
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9464747
# Step 4: Get decision values for ROC curve
fitted <- attributes(predict(svm_model, newdata = as.data.frame(X_test), decision.values = TRUE))$decision.values
# Step 5: Generate ROC plot for the test set
roc_curve <- roc(y_test, -fitted) # Note: negate so decision values align with class labels
# Plot the ROC curve
plot(roc_curve, main = "ROC Curve for Test Data")
# Add AUC to the plot
auc_value <- auc(roc_curve)
legend("bottomright", legend = paste("AUC =", round(auc_value, 2)), bty = "n")
The model achieved an impressive accuracy of about 94.6% and a high AUC of 0.99, indicating excellent performance in classification. Additionally, the high precision, recall, and F1-score values reflect the model’s capability to accurately classify both class 0 and class 1. These results suggest that the model is highly effective in distinguishing between the two classes, making it reliable for practical applications.
# Recover the linear SVM's weight vector w = sum_i (alpha_i * y_i) * SV_i:
# svm_model$coefs holds the signed alpha coefficients and svm_model$SV the
# support vectors, so their matrix product is a 1 x p matrix of per-feature
# weights. NOTE(review): with scale = TRUE these weights are in scaled-feature
# units — confirm before comparing raw magnitudes across features.
coefficients <- as.vector(svm_model$coefs) %*% svm_model$SV
# rho is the offset of the separating hyperplane (decision rule: w.x - rho)
intercept <- svm_model$rho
# Combine intercept and feature weights into a single numeric vector
all_coefficients <- c(intercept, coefficients)
# Print coefficients
cat("Coefficients (including intercept):\n")
## Coefficients (including intercept):
print(all_coefficients)
## [1] 8.414745e+01 6.496813e-02 -5.874921e+02 3.056034e-02 1.983059e+01
## [6] 1.000746e-01 2.364744e-01 -1.865601e-10 2.994796e-10 6.606222e-03
## [11] 2.135607e-10 -1.592937e-02 1.581999e-02 4.141880e-03 -2.317861e-02
## [16] -2.317393e-02 -1.273508e-02 4.583349e-02 -3.041952e-02 1.150636e-01
## [21] 4.433490e-02 -2.742884e-02 9.080968e-03 -7.041609e-03 3.055296e-02
## [26] -3.760284e-02 3.020458e-02 5.573072e-02 4.447850e-02 -4.367848e-10
## [31] -1.558534e-02 -6.564926e-03 -6.024834e-10 -3.561368e-02 1.181121e-03
## [36] 1.483217e-02 -4.697419e-02 -7.080146e-04 3.855689e-10 -8.795659e-03
# Sanity check: 39 feature weights + 1 intercept = 40 coefficients
num_coefficients <- length(all_coefficients)
cat("Number of Coefficients (including intercept):", num_coefficients, "\n")
## Number of Coefficients (including intercept): 40
# Get feature names from the training matrix
feature_names <- colnames(X_train)
# Attach feature names to the weight vector. Note: `coefficients` is a
# 1 x p matrix, so setNames puts the names in a "names" attribute on the
# matrix (which is why the printout below shows both forms).
named_coefficients <- setNames(coefficients, feature_names)
# Print named coefficients
cat("Feature Coefficients:\n")
## Feature Coefficients:
print(named_coefficients)
## Rating Reviews Size Price Content.Rating Last.Updated
## [1,] 0.06496813 -587.4921 0.03056034 19.83059 0.1000746 0.2364744
## catART_AND_DESIGN catAUTO_AND_VEHICLES catBEAUTY catBOOKS_AND_REFERENCE
## [1,] -1.865601e-10 2.994796e-10 0.006606222 2.135607e-10
## catBUSINESS catCOMICS catCOMMUNICATION catDATING catEDUCATION
## [1,] -0.01592937 0.01581999 0.00414188 -0.02317861 -0.02317393
## catENTERTAINMENT catEVENTS catFAMILY catFINANCE catFOOD_AND_DRINK
## [1,] -0.01273508 0.04583349 -0.03041952 0.1150636 0.0443349
## catGAME catHEALTH_AND_FITNESS catHOUSE_AND_HOME catLIBRARIES_AND_DEMO
## [1,] -0.02742884 0.009080968 -0.007041609 0.03055296
## catLIFESTYLE catMAPS_AND_NAVIGATION catMEDICAL catNEWS_AND_MAGAZINES
## [1,] -0.03760284 0.03020458 0.05573072 0.0444785
## catPARENTING catPERSONALIZATION catPHOTOGRAPHY catPRODUCTIVITY
## [1,] -4.367848e-10 -0.01558534 -0.006564926 -6.024834e-10
## catSHOPPING catSOCIAL catSPORTS catTOOLS catTRAVEL_AND_LOCAL
## [1,] -0.03561368 0.001181121 0.01483217 -0.04697419 -0.0007080146
## catVIDEO_PLAYERS catWEATHER
## [1,] 3.855689e-10 -0.008795659
## attr(,"names")
## [1] "Rating" "Reviews" "Size"
## [4] "Price" "Content.Rating" "Last.Updated"
## [7] "catART_AND_DESIGN" "catAUTO_AND_VEHICLES" "catBEAUTY"
## [10] "catBOOKS_AND_REFERENCE" "catBUSINESS" "catCOMICS"
## [13] "catCOMMUNICATION" "catDATING" "catEDUCATION"
## [16] "catENTERTAINMENT" "catEVENTS" "catFAMILY"
## [19] "catFINANCE" "catFOOD_AND_DRINK" "catGAME"
## [22] "catHEALTH_AND_FITNESS" "catHOUSE_AND_HOME" "catLIBRARIES_AND_DEMO"
## [25] "catLIFESTYLE" "catMAPS_AND_NAVIGATION" "catMEDICAL"
## [28] "catNEWS_AND_MAGAZINES" "catPARENTING" "catPERSONALIZATION"
## [31] "catPHOTOGRAPHY" "catPRODUCTIVITY" "catSHOPPING"
## [34] "catSOCIAL" "catSPORTS" "catTOOLS"
## [37] "catTRAVEL_AND_LOCAL" "catVIDEO_PLAYERS" "catWEATHER"
# Rank features by the absolute magnitude of their weights — a proxy for
# feature importance in a linear SVM (sign only encodes direction)
sorted_coefficients <- sort(abs(named_coefficients), decreasing = TRUE)
# Print sorted feature importance
cat("Sorted Feature Importance:\n")
## Sorted Feature Importance:
print(sorted_coefficients)
## Reviews Price Last.Updated
## 5.874921e+02 1.983059e+01 2.364744e-01
## catFINANCE Content.Rating Rating
## 1.150636e-01 1.000746e-01 6.496813e-02
## catMEDICAL catTOOLS catEVENTS
## 5.573072e-02 4.697419e-02 4.583349e-02
## catNEWS_AND_MAGAZINES catFOOD_AND_DRINK catLIFESTYLE
## 4.447850e-02 4.433490e-02 3.760284e-02
## catSHOPPING Size catLIBRARIES_AND_DEMO
## 3.561368e-02 3.056034e-02 3.055296e-02
## catFAMILY catMAPS_AND_NAVIGATION catGAME
## 3.041952e-02 3.020458e-02 2.742884e-02
## catDATING catEDUCATION catBUSINESS
## 2.317861e-02 2.317393e-02 1.592937e-02
## catCOMICS catPERSONALIZATION catSPORTS
## 1.581999e-02 1.558534e-02 1.483217e-02
## catENTERTAINMENT catHEALTH_AND_FITNESS catWEATHER
## 1.273508e-02 9.080968e-03 8.795659e-03
## catHOUSE_AND_HOME catBEAUTY catPHOTOGRAPHY
## 7.041609e-03 6.606222e-03 6.564926e-03
## catCOMMUNICATION catSOCIAL catTRAVEL_AND_LOCAL
## 4.141880e-03 1.181121e-03 7.080146e-04
## catPRODUCTIVITY catPARENTING catVIDEO_PLAYERS
## 6.024834e-10 4.367848e-10 3.855689e-10
## catAUTO_AND_VEHICLES catBOOKS_AND_REFERENCE catART_AND_DESIGN
## 2.994796e-10 2.135607e-10 1.865601e-10
# Keep the 15 largest-magnitude coefficients for display
top_coef <- head(sorted_coefficients, n = 15)
# Optional: horizontal bar chart of the top features
barplot(
  top_coef,
  horiz = TRUE,
  col = "steelblue",
  main = "Feature Importance from SVM Coefficients",
  xlab = "Features",
  las = 2,        # draw axis labels perpendicular to the axis
  cex.names = 0.3 # shrink names so all 15 labels fit
)
The analysis shows that Reviews, Price, and Last.Updated are the top features influencing the model’s performance. Additionally, the leading app categories — Finance, Medical, Tools, Events, and News and Magazines — suggest that focusing on these areas can enhance the chances of app success.